from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import pandas as pd
# Load the recipe dataset and build one text "document" per row.
df = pd.read_csv('srep00196-s2.csv', header='infer')  # read data
df.head()
df = pd.DataFrame(df)
# Concatenate the two ingredient columns into a single text field.
df['food'] = df['food1'].map(str) + df['food2']
documents = list(df.food)
# Ingredient tokens use '_' as a word separator; restore spaces so the
# TF-IDF tokenizer sees individual words.
documents = [doc.replace('_', ' ') for doc in documents]
documents[:10]
documents = documents[:500]  # keep only the first 500 docs (memory limit on my laptop)
len(documents)
documents[:10]
# Cluster the documents into 5 groups with TF-IDF + k-means and show the
# top-weighted terms of each cluster centroid.
true_k = 5
# BUG FIX: the original passed the undefined name `order_centroids` as the
# first positional argument (TfidfVectorizer's `input` parameter), which
# raises a NameError; only the stop-word setting is needed here.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(documents)
model = KMeans(n_clusters=true_k)
model.fit(X)

print("Top terms per cluster:")
# Indices of each centroid's term weights, sorted descending.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()  # get_feature_names() was removed in sklearn 1.2
for i in range(true_k):
    # Python 3 print calls (the original used Python 2 print statements).
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind], end='')
    print()
# %matplotlib inline  # Jupyter-only magic; it is a SyntaxError in a plain .py file, so kept as a comment
import os  # for os.path.basename
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity
# To select the best k, we sweep k from 2 to 14 and report the sum of
# within-cluster distances (inertia) and the top terms for each clustering.
# Sweep k over 2..14, recording the k-means inertia for each value so we
# can pick the best k with an elbow plot afterwards.
ks = []
distance = []
for true_k in range(2, 15):
    # BUG FIX: dropped the undefined `order_centroids` positional argument
    # (TfidfVectorizer's `input` parameter) from the original call.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)
    model = KMeans(n_clusters=true_k)
    model.fit(X)
    print("Number of clusters : %d" % true_k)
    print("Sum of distances: %d " % model.inertia_)
    ks.append(true_k)
    distance.append(model.inertia_)
    print("Top terms per cluster:")
    # Indices of each centroid's term weights, sorted descending.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()  # get_feature_names() was removed in sklearn 1.2
    for i in range(true_k):
        # Python 3 print calls (the original used Python 2 print statements).
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
        print()
# Project the documents to 2-D with metric MDS on cosine distances and
# scatter-plot them coloured by their k-means cluster assignment.
dist = 1 - cosine_similarity(X)
# Two components because we're plotting points in a plane;
# dissimilarity="precomputed" because we supply a distance matrix;
# random_state is fixed so the plot is reproducible.
# (The original also had a bare no-op `MDS()` call, removed here.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components) — the original comment had it reversed
xs, ys = pos[:, 0], pos[:, 1]
# Colours and display names for up to 14 clusters.
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',
                  4: '#66a61e', 5: '#E24A33', 6: '#C4AD66', 7: '#fa8174',
                  8: '#E8000B', 9: '#B0E0E6', 10: '#7A68A6', 11: '#ccebc4',
                  12: '#4878CF', 13: '#03ED3A'}
cluster_names = {i: 'cluster %d' % (i + 1) for i in range(14)}
clusters = model.labels_.tolist()
# Data frame holding the MDS coordinates, cluster label and document text.
# (The original built this identical frame twice; once is enough.)
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents))
# Group by cluster so each cluster becomes one plot layer.
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.05)  # adds 5% padding to the autoscaling
# Layer the plot cluster by cluster, looking up the colour and legend
# label for each group key in the dicts above.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
# Hide all ticks and tick labels.
# BUG FIX: modern matplotlib requires booleans here (the string 'off' is
# rejected), and the y-axis takes left/right rather than top/bottom.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False,
               labelleft=False)
ax.legend(numpoints=1)  # show legend with only 1 point per entry
# Annotate every point with its document text.
# BUG FIX: DataFrame.ix was removed from pandas; use .iloc instead.
for i in range(len(df)):
    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)
plt.show()  # show the plot
# Elbow plot: total within-cluster distance (inertia) for each candidate k,
# drawn as unconnected red circle markers.
plt.plot(ks, distance, color='red', marker='o', linestyle='')
plt.show()
# The best model uses k = 14, which gives the minimum total
# within-cluster distance (about 447).
# Refit the chosen model (k = 14) so its labels can be visualised below.
for true_k in range(14, 15):
    # BUG FIX: dropped the undefined `order_centroids` positional argument
    # (TfidfVectorizer's `input` parameter) from the original call.
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(documents)
    model = KMeans(n_clusters=true_k)
    model.fit(X)
    print("Number of clusters : %d" % true_k)
    print("Sum of distances: %d " % model.inertia_)
    ks.append(true_k)
    distance.append(model.inertia_)
    print("Top terms per cluster:")
    # Indices of each centroid's term weights, sorted descending.
    order_centroids = model.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names_out()  # get_feature_names() was removed in sklearn 1.2
    for i in range(true_k):
        # Python 3 print calls (the original used Python 2 print statements).
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :5]:
            print(' %s' % terms[ind], end='')
        print()
# Project the documents to 2-D with metric MDS on cosine distances and
# scatter-plot them coloured by the final (k = 14) cluster assignment.
dist = 1 - cosine_similarity(X)
# Two components because we're plotting points in a plane;
# dissimilarity="precomputed" because we supply a distance matrix;
# random_state is fixed so the plot is reproducible.
# (The original also had a bare no-op `MDS()` call, removed here.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
pos = mds.fit_transform(dist)  # shape (n_samples, n_components) — the original comment had it reversed
xs, ys = pos[:, 0], pos[:, 1]
# Colours and display names for up to 14 clusters.
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#e7298a',
                  4: '#66a61e', 5: '#E24A33', 6: '#C4AD66', 7: '#fa8174',
                  8: '#E8000B', 9: '#B0E0E6', 10: '#7A68A6', 11: '#ccebc4',
                  12: '#4878CF', 13: '#03ED3A'}
cluster_names = {i: 'cluster %d' % (i + 1) for i in range(14)}
clusters = model.labels_.tolist()
# Data frame holding the MDS coordinates, cluster label and document text.
# (The original built this identical frame twice; once is enough.)
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=documents))
# Group by cluster so each cluster becomes one plot layer.
groups = df.groupby('label')

fig, ax = plt.subplots(figsize=(17, 9))  # set size
ax.margins(0.05)  # adds 5% padding to the autoscaling
# Layer the plot cluster by cluster, looking up the colour and legend
# label for each group key in the dicts above.
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12,
            label=cluster_names[name], color=cluster_colors[name],
            mec='none')
ax.set_aspect('auto')
# Hide all ticks and tick labels.
# BUG FIX: modern matplotlib requires booleans here (the string 'off' is
# rejected), and the y-axis takes left/right rather than top/bottom.
ax.tick_params(axis='x', which='both', bottom=False, top=False,
               labelbottom=False)
ax.tick_params(axis='y', which='both', left=False, right=False,
               labelleft=False)
ax.legend(numpoints=1)  # show legend with only 1 point per entry
# Annotate every point with its document text.
# BUG FIX: DataFrame.ix was removed from pandas; use .iloc instead.
for i in range(len(df)):
    ax.text(df.iloc[i]['x'], df.iloc[i]['y'], df.iloc[i]['title'], size=8)
plt.show()  # show the plot